{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comment on the basic tutorial for text classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Overview\n",
    "This tutorial is extracted from https://github.com/bentrevett/pytorch-sentiment-analysis. Before proceeding, make sure to install the required library. \n",
    "\n",
    "In this follow-up tutorial, we examine how we can improve the code from previous tutorial.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "tokenizer = word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torchtext import data\n",
    "from torchtext import datasets\n",
    "\n",
    "SEED = 1234\n",
    "\n",
    "torch.manual_seed(SEED)\n",
    "torch.backends.cudnn.deterministic = True\n",
    "\n",
    "TEXT = data.Field(tokenize = tokenizer, include_lengths = True)\n",
    "LABEL = data.LabelField(dtype = torch.float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchtext import datasets\n",
    "\n",
    "train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n",
    "### Here we already have a problem. The dataset is already extracted from a built-in data set. So how can we use out own dataset?\n",
    "### See https://torchtext.readthedocs.io/en/latest/data.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "train_data, valid_data = train_data.split(random_state = random.seed(SEED))\n",
    "### 1. The split is a little bit strange\n",
    "### 2. Usually to add to stability, it is better to use cross validation, unless the dataset is huge."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_VOCAB_SIZE = 25000 ### This can be tuned. \n",
    "\n",
    "TEXT.build_vocab(train_data, \n",
    "                 max_size = MAX_VOCAB_SIZE, \n",
    "                 vectors = \"glove.6B.300d\", \n",
    "                 unk_init = torch.Tensor.normal_)\n",
    "\n",
    "LABEL.build_vocab(train_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BATCH_SIZE = 64 ### This can be tuned\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  ### This is only possible for single GPU training\n",
    "\n",
    "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n",
    "    (train_data, valid_data, test_data), \n",
    "    batch_size = BATCH_SIZE,\n",
    "    sort_within_batch = True,\n",
    "    device = device)  ### Typically when training we use Dataset and Dataloader (https://pytorch.org/tutorials/beginner/data_loading_tutorial.html)\n",
    "### If we use this, it is hard to combine it to other code. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.nn as nn\n",
    "\n",
    "class RNN(nn.Module):\n",
    "    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, \n",
    "                 bidirectional, dropout, pad_idx):\n",
    "        \n",
    "        super().__init__()\n",
    "        \n",
    "        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)\n",
    "        \n",
    "        self.rnn = nn.LSTM(embedding_dim, \n",
    "                           hidden_dim, \n",
    "                           num_layers=n_layers, \n",
    "                           bidirectional=bidirectional, \n",
    "                           dropout=dropout)\n",
    "        \n",
    "        self.fc = nn.Linear(hidden_dim * 2, output_dim)\n",
    "        \n",
    "        self.dropout = nn.Dropout(dropout)\n",
    "        \n",
    "    def forward(self, text, text_lengths):\n",
    "        \n",
    "        embedded = self.embedding(text)\n",
    "        \n",
    "        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)\n",
    "        \n",
    "        packed_output, (hidden, cell) = self.rnn(packed_embedded)\n",
    "        \n",
    "        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)\n",
    "        \n",
    "        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)) \n",
    "                \n",
    "        return self.fc(hidden)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "INPUT_DIM = len(TEXT.vocab)\n",
    "EMBEDDING_DIM = 300\n",
    "HIDDEN_DIM = 256 # This can be tuned.\n",
    "OUTPUT_DIM = 1\n",
    "N_LAYERS = 2 # This needs to be discussed. Is it a good idea to stack two LSTM together?\n",
    "BIDIRECTIONAL = True \n",
    "DROPOUT = 0.2 # This can be tuned\n",
    "PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]\n",
    "\n",
    "model = RNN(INPUT_DIM, \n",
    "            EMBEDDING_DIM, \n",
    "            HIDDEN_DIM, \n",
    "            OUTPUT_DIM, \n",
    "            N_LAYERS, \n",
    "            BIDIRECTIONAL, \n",
    "            DROPOUT, \n",
    "            PAD_IDX)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pretrained_embeddings = TEXT.vocab.vectors\n",
    "model.embedding.weight.data.copy_(pretrained_embeddings)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]\n",
    "\n",
    "model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)\n",
    "model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.optim as optim\n",
    "\n",
    "optimizer = optim.Adam(model.parameters()) # This is serious offence. Why there is no tunning parameter here? What about learning rate. Also, why is the embedding also trained here? \n",
    "criterion = nn.BCEWithLogitsLoss()\n",
    "\n",
    "model = model.to(device)\n",
    "criterion = criterion.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def binary_accuracy(preds, y):\n",
    "    rounded_preds = torch.round(torch.sigmoid(preds))\n",
    "    correct = (rounded_preds == y).float() #convert into float for division \n",
    "    acc = correct.sum() / len(correct)\n",
    "    return acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train(model, iterator, optimizer, criterion):\n",
    "    \n",
    "    epoch_loss = 0\n",
    "    epoch_acc = 0\n",
    "    \n",
    "    model.train()\n",
    "    \n",
    "    for batch in iterator:\n",
    "        \n",
    "        optimizer.zero_grad()\n",
    "        \n",
    "        text, text_lengths = batch.text\n",
    "        \n",
    "        predictions = model(text, text_lengths).squeeze(1)\n",
    "        \n",
    "        loss = criterion(predictions, batch.label)\n",
    "        \n",
    "        acc = binary_accuracy(predictions, batch.label)\n",
    "        \n",
    "        loss.backward()\n",
    "        \n",
    "        optimizer.step()\n",
    "        \n",
    "        epoch_loss += loss.item()\n",
    "        epoch_acc += acc.item()\n",
    "        \n",
    "    return epoch_loss / len(iterator), epoch_acc / len(iterator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate(model, iterator, criterion):\n",
    "    \n",
    "    epoch_loss = 0\n",
    "    epoch_acc = 0\n",
    "    \n",
    "    model.eval()\n",
    "    \n",
    "    with torch.no_grad():\n",
    "    \n",
    "        for batch in iterator:\n",
    "\n",
    "            text, text_lengths = batch.text\n",
    "            \n",
    "            predictions = model(text, text_lengths).squeeze(1)\n",
    "            \n",
    "            loss = criterion(predictions, batch.label)\n",
    "            \n",
    "            acc = binary_accuracy(predictions, batch.label)\n",
    "\n",
    "            epoch_loss += loss.item()\n",
    "            epoch_acc += acc.item()\n",
    "        \n",
    "    return epoch_loss / len(iterator), epoch_acc / len(iterator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "def epoch_time(start_time, end_time):\n",
    "    elapsed_time = end_time - start_time\n",
    "    elapsed_mins = int(elapsed_time / 60)\n",
    "    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
    "    return elapsed_mins, elapsed_secs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "N_EPOCHS = 5\n",
    "\n",
    "best_valid_loss = float('inf')\n",
    "\n",
    "for epoch in range(N_EPOCHS):\n",
    "\n",
    "    start_time = time.time()\n",
    "    \n",
    "    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)\n",
    "    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion) # To validate model at the end of each epoch is also serious offence. \n",
    "    \n",
    "    end_time = time.time()\n",
    "\n",
    "    epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
    "    \n",
    "    if valid_loss < best_valid_loss: # This is serious offence here. It is a VERY bad idea to only save the best model. When you want to babysit the training process, it is usually a bad idea to start from the best model. \n",
    "        best_valid_loss = valid_loss\n",
    "        torch.save(model.state_dict(), 'model.pt') # The estimator state should also be saved (as well as scheduler if available)\n",
    "    \n",
    "    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')\n",
    "    print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n",
    "    print(f'\\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.load_state_dict(torch.load('model.pt'))\n",
    "\n",
    "test_loss, test_acc = evaluate(model, test_iterator, criterion)\n",
    "\n",
    "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Questions\n",
    "1. What if we want to combine character-level embedding, word-embedding and sentence embedding?\n",
    "2. What if we want to use CNN?\n",
    "3. What if we want to combine CNN with LSTM?"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
